import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import numpy as np
# Load the dataset from data.csv into a DataFrame and print it for a first look.
beverage_df= pd.read_csv('data.csv')
print(beverage_df)
ID fixed acidity volatile acidity citric acid residual sugar \
0 1 7.4 0.700 0.00 1.9
1 2 7.8 0.880 0.00 2.6
2 3 7.8 0.760 0.04 2.3
3 4 11.2 0.280 0.56 1.9
4 5 7.4 0.700 0.00 1.9
... ... ... ... ... ...
1596 1597 5.9 0.550 0.10 2.2
1597 1598 6.3 0.510 0.13 2.3
1598 1599 5.9 0.645 0.12 2.0
1599 1600 6.0 0.310 0.47 3.6
1600 1601 7.2 0.390 0.44 2.6
chlorides free sulfur dioxide total sulfur dioxide density pH \
0 0.076 11.0 34.0 0.99780 3.51
1 0.098 25.0 67.0 0.99680 3.20
2 0.092 15.0 54.0 0.99700 NaN
3 0.075 17.0 60.0 0.99800 3.16
4 0.076 11.0 34.0 0.99780 3.51
... ... ... ... ... ...
1596 0.062 39.0 51.0 0.99512 3.52
1597 0.076 29.0 40.0 0.99574 3.42
1598 0.075 32.0 44.0 0.99547 3.57
1599 0.067 18.0 42.0 0.99549 3.39
1600 0.066 22.0 48.0 0.99494 3.30
sulphates alcohol quality
0 0.56 9.4 5.0
1 0.68 9.8 5.0
2 0.65 9.8 5.0
3 0.58 9.8 6.0
4 0.56 9.4 5.0
... ... ... ...
1596 0.76 11.2 6.0
1597 0.75 11.0 6.0
1598 0.71 10.2 5.0
1599 0.66 11.0 6.0
1600 0.84 11.5 6.0
[1601 rows x 13 columns]
# Boolean Series that is True for rows repeating an earlier row; with no
# subset given, every column (including ID) takes part in the comparison.
# NOTE(review): records that differ only in ID would not be flagged here —
# confirm whether ID should be excluded from the duplicate check.
beverage_df.duplicated()
0 False
1 False
2 False
3 False
4 False
...
1596 False
1597 False
1598 False
1599 False
1600 False
Length: 1601, dtype: bool
# Count the rows flagged as duplicates (True sums as 1).
beverage_df.duplicated().sum()
0
# Keep a de-duplicated copy and check how many rows/columns remain.
# NOTE(review): the visible code that follows keeps working with beverage_df,
# not beverage_df1 — confirm whether the de-duplicated frame was meant to be used.
beverage_df1 = beverage_df.drop_duplicates()
beverage_df1.shape
(1601, 13)
# Element-wise mask of the frame: True wherever a value is missing.
beverage_df.isnull()
| ID | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 1 | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False | False | False | False | True | False | False | False |
| 3 | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1596 | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 1597 | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 1598 | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 1599 | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 1600 | False | False | False | False | False | False | False | False | False | False | False | False | False |
1601 rows × 13 columns
# Number of missing values in each column.
beverage_df.isnull().sum()
ID 0 fixed acidity 1 volatile acidity 0 citric acid 1 residual sugar 0 chlorides 3 free sulfur dioxide 1 total sulfur dioxide 1 density 1 pH 1 sulphates 0 alcohol 0 quality 1 dtype: int64
# Drop every row that contains at least one missing value, then re-count the
# missing values per column to verify that none remain.
updatedbeverage_df = beverage_df.dropna()
updatedbeverage_df.isnull().sum()
ID 0 fixed acidity 0 volatile acidity 0 citric acid 0 residual sugar 0 chlorides 0 free sulfur dioxide 0 total sulfur dioxide 0 density 0 pH 0 sulphates 0 alcohol 0 quality 0 dtype: int64
# Rebind beverage_df to the cleaned data so later cells use it.
# NOTE(review): updatedbeverage_df is already the result of dropna(), so this
# second dropna() appears to be redundant — confirm and simplify if so.
beverage_df=updatedbeverage_df.dropna()
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
beverage_df.describe()
| ID | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1591.000000 | 1591.000000 | 1591.000000 | 1591.000000 | 1591.000000 | 1591.000000 | 1591.000000 | 1591.000000 | 1591.000000 | 1591.000000 | 1591.000000 | 1591.000000 | 1591.000000 |
| mean | 801.060968 | 8.320616 | 0.527681 | 0.271477 | 2.540761 | 0.087500 | 15.861722 | 46.498429 | 0.996747 | 3.311358 | 0.658611 | 10.426053 | 5.637964 |
| std | 460.740942 | 1.740134 | 0.179045 | 0.194807 | 1.413076 | 0.047175 | 10.429385 | 32.998659 | 0.001889 | 0.154548 | 0.169786 | 1.066706 | 0.807772 |
| min | 1.000000 | 4.600000 | 0.120000 | 0.000000 | 0.900000 | 0.012000 | 1.000000 | 6.000000 | 0.990070 | 2.740000 | 0.330000 | 8.400000 | 3.000000 |
| 25% | 402.500000 | 7.100000 | 0.390000 | 0.090000 | 1.900000 | 0.070000 | 7.000000 | 22.000000 | 0.995600 | 3.210000 | 0.550000 | 9.500000 | 5.000000 |
| 50% | 801.000000 | 7.900000 | 0.520000 | 0.260000 | 2.200000 | 0.079000 | 14.000000 | 38.000000 | 0.996750 | 3.310000 | 0.620000 | 10.200000 | 6.000000 |
| 75% | 1199.500000 | 9.200000 | 0.640000 | 0.420000 | 2.600000 | 0.090000 | 21.000000 | 62.000000 | 0.997845 | 3.400000 | 0.730000 | 11.100000 | 6.000000 |
| max | 1601.000000 | 15.900000 | 1.580000 | 1.000000 | 15.500000 | 0.611000 | 72.000000 | 289.000000 | 1.003690 | 4.010000 | 2.000000 | 14.900000 | 8.000000 |
# Print the index range, per-column non-null counts and dtypes, and memory usage.
beverage_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1591 entries, 0 to 1600 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 1591 non-null int64 1 fixed acidity 1591 non-null float64 2 volatile acidity 1591 non-null float64 3 citric acid 1591 non-null float64 4 residual sugar 1591 non-null float64 5 chlorides 1591 non-null float64 6 free sulfur dioxide 1591 non-null float64 7 total sulfur dioxide 1591 non-null float64 8 density 1591 non-null float64 9 pH 1591 non-null float64 10 sulphates 1591 non-null float64 11 alcohol 1591 non-null float64 12 quality 1591 non-null float64 dtypes: float64(12), int64(1) memory usage: 174.0 KB
# Give the multi-word columns underscore-separated names; columns that are
# not listed in the mapping keep their current names.
column_renames = {
    'fixed acidity': 'fixed_acidity',
    'volatile acidity': 'volatile_acidity',
    'citric acid': 'citric_acid',
    'residual sugar': 'residual_sugar',
    'free sulfur dioxide': 'free_sulfur_dioxide',
    'total sulfur dioxide': 'total_sulfur_dioxide',
}
beverage_df = beverage_df.rename(columns=column_renames)
# Show the resulting column labels.
beverage_df.columns
Index(['ID', 'fixed_acidity', 'volatile_acidity', 'citric_acid',
'residual_sugar', 'chlorides', 'free_sulfur_dioxide',
'total_sulfur_dioxide', 'density', 'pH', 'sulphates', 'alcohol',
'quality'],
dtype='object')
# Scatter plot of alcohol content (x-axis) against quality (y-axis).
X = beverage_df['alcohol']
Y = beverage_df['quality']

# Draw on the current Axes through its object interface.
ax = plt.gca()
ax.scatter(X, Y)
ax.set_title('Scatter Plot of Alcohol Content vs Quality of Beverage')
ax.set_xlabel('Alcohol Content')
ax.set_ylabel('Quality')
plt.show()
METHOD USED:
PANDAS AND YDATA_PROFILING, WHICH ARE PRE-BUILT LIBRARIES, WERE INSTALLED AND IMPORTED INTO THE JUPYTER NOTEBOOK.
THE PROFILING LIBRARY GIVES A DETAILED STATISTICAL SUMMARY OF OUR DATA BY PROVIDING HISTOGRAMS, HEATMAPS AND CORRELATION DATA.
# Build an automated profiling report for the cleaned data.
# The pandas_profiling package was renamed to ydata_profiling and the old
# import path emits a DeprecationWarning, so prefer the new name and fall
# back to the legacy one only when the new package is not installed.
try:
    from ydata_profiling import ProfileReport
except ImportError:
    from pandas_profiling import ProfileReport

profile = ProfileReport(beverage_df, title="Data Report")
C:\Users\Damian Yawuliga\AppData\Local\Temp\ipykernel_4632\1269014081.py:1: DeprecationWarning: `import pandas_profiling` is going to be deprecated by April 1st. Please use `import ydata_profiling` instead. from pandas_profiling import ProfileReport
# Bare expression at the end of a notebook cell: displays the ProfileReport
# object, which is what triggers the summarize/render progress shown below.
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]